library(rlang)
library(readr)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:rlang':
##
## set_names
# Préparation des données
library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.2 v purrr 1.0.1
## v forcats 1.0.0 v stringr 1.5.0
## v ggplot2 3.4.2 v tibble 3.2.1
## v lubridate 1.9.2 v tidyr 1.3.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x purrr::%@%() masks rlang::%@%()
## x tidyr::extract() masks magrittr::extract()
## x dplyr::filter() masks stats::filter()
## x purrr::flatten() masks rlang::flatten()
## x purrr::flatten_chr() masks rlang::flatten_chr()
## x purrr::flatten_dbl() masks rlang::flatten_dbl()
## x purrr::flatten_int() masks rlang::flatten_int()
## x purrr::flatten_lgl() masks rlang::flatten_lgl()
## x purrr::flatten_raw() masks rlang::flatten_raw()
## x purrr::invoke() masks rlang::invoke()
## x dplyr::lag() masks stats::lag()
## x purrr::set_names() masks magrittr::set_names(), rlang::set_names()
## x purrr::splice() masks rlang::splice()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the raw titles table from "titles.csv"; only the `type` column gets an
# explicit (character) specification.
netflix_titles <- read_csv(
  file = "titles.csv",
  col_types = cols(type = col_character())
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
# Reduce `genres` and `production_countries` to the first match of "\\w+"
# (the first word-like token of each value), then list the column names.
data_clean <- mutate(
  netflix_titles,
  genres = str_extract(genres, "\\w+"),
  production_countries = str_extract(production_countries, "\\w+")
)
names(data_clean)
## [1] "id" "title" "type"
## [4] "description" "release_year" "age_certification"
## [7] "runtime" "genres" "production_countries"
## [10] "seasons" "imdb_id" "imdb_score"
## [13] "imdb_votes" "tmdb_popularity" "tmdb_score"
[1] “id” : Identifiant unique pour chaque entrée dans le dataset. [2] “title” : Titre du film ou de l’émission. [3] “type” : Type de contenu : “MOVIE” (film) ou “SHOW” (émission). [4] “description” : Description du film ou de l’émission. [5] “release_year” : Année de sortie du film ou de l’émission. [6] “age_certification” : Certification d’âge pour le contenu, indiquant l’âge recommandé pour les spectateurs. [7] “runtime” : Durée en minutes du film ou de l’émission. [8] “genres” : Genres associés au film ou à l’émission. [9] “production_countries” : Pays de production du film ou de l’émission. [10] “seasons” : Nombre de saisons pour les séries télévisées. (NA si non applicable) [11] “imdb_id” : Identifiant IMDb du film ou de l’émission. [12] “imdb_score” : Score IMDb du film ou de l’émission. [13] “imdb_votes” : Nombre de votes IMDb pour le film ou l’émission. [14] “tmdb_popularity” : Popularité du film ou de l’émission sur TMDB (The Movie Database). [15] “tmdb_score” : Score TMDB du film ou de l’émission.
# Print the structure of the table: one line per column with its type and
# first values.
str(object = data_clean)
## tibble [6,138 x 15] (S3: tbl_df/tbl/data.frame)
## $ id : chr [1:6138] "ts300399" "tm82169" "tm17823" "tm191099" ...
## $ title : chr [1:6138] "Five Came Back: The Reference Films" "Rocky" "Grease" "The Sting" ...
## $ type : chr [1:6138] "SHOW" "MOVIE" "MOVIE" "MOVIE" ...
## $ description : chr [1:6138] "This collection includes 12 World War II-era propaganda films — many of which are graphic and offensive — discu"| __truncated__ "When world heavyweight boxing champion, Apollo Creed wants to give an unknown fighter a shot at the title as a "| __truncated__ "Australian good girl Sandy and greaser Danny fell in love over the summer. But when they unexpectedly discover "| __truncated__ "A novice con man teams up with an acknowledged master to avenge the murder of a mutual friend by pulling off th"| __truncated__ ...
## $ release_year : num [1:6138] 1945 1976 1978 1973 1979 ...
## $ age_certification : chr [1:6138] "TV-MA" "PG" "PG" "PG" ...
## $ runtime : num [1:6138] 51 119 110 129 119 91 109 30 94 120 ...
## $ genres : chr [1:6138] "documentation" "drama" "romance" "crime" ...
## $ production_countries: chr [1:6138] "US" "US" "US" "US" ...
## $ seasons : num [1:6138] 1 NA NA NA NA NA NA 4 NA NA ...
## $ imdb_id : chr [1:6138] NA "tt0075148" "tt0077631" "tt0070735" ...
## $ imdb_score : num [1:6138] NA 8.1 7.2 8.3 7.3 8.2 7.4 8.8 8 7.5 ...
## $ imdb_votes : num [1:6138] NA 588100 283316 266738 216307 ...
## $ tmdb_popularity : num [1:6138] 0.601 106.361 33.16 24.616 75.699 ...
## $ tmdb_score : num [1:6138] NA 7.78 7.41 8.02 7.25 ...
# Print per-column summary statistics (quantiles and NA counts for numeric
# columns, length/class for character columns).
summary(object = data_clean)
## id title type description
## Length:6138 Length:6138 Length:6138 Length:6138
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## release_year age_certification runtime genres
## Min. :1945 Length:6138 Min. : 0.00 Length:6138
## 1st Qu.:2017 Class :character 1st Qu.: 44.00 Class :character
## Median :2019 Mode :character Median : 80.00 Mode :character
## Mean :2017 Mean : 76.37
## 3rd Qu.:2021 3rd Qu.:105.00
## Max. :2023 Max. :225.00
## NA's :2 NA's :1
## production_countries seasons imdb_id imdb_score
## Length:6138 Min. : 1.000 Length:6138 Min. :1.500
## Class :character 1st Qu.: 1.000 Class :character 1st Qu.:5.800
## Mode :character Median : 1.000 Mode :character Median :6.600
## Mean : 5.435 Mean :6.541
## 3rd Qu.: 2.000 3rd Qu.:7.300
## Max. :7677.000 Max. :9.600
## NA's :3832 NA's :469
## imdb_votes tmdb_popularity tmdb_score
## Min. : 5.0 Min. : 0.0094 Min. : 0.500
## 1st Qu.: 516.8 1st Qu.: 3.3805 1st Qu.: 6.000
## Median : 2093.5 Median : 7.5800 Median : 6.790
## Mean : 21152.7 Mean : 19.2687 Mean : 6.633
## 3rd Qu.: 8885.5 3rd Qu.: 16.5263 3rd Qu.: 7.400
## Max. :2684317.0 Max. :1078.6370 Max. :10.000
## NA's :486 NA's :78 NA's :254
# Open the spreadsheet viewer only in an interactive session, so that knitting
# or running the script in batch mode does not depend on a viewer being
# available.
if (interactive()) {
  View(data_clean)
}
## Data cleaning and transformation
library(tidyverse)
data_clean$genres <- as.factor(data_clean$genres)
# Drop the columns that are not used in the rest of the analysis.
data_clean <- data_clean %>%
  select(-c(seasons, id, description, imdb_id, age_certification))
library(dplyr)
# Remove duplicate titles and keep only the first row of each one.
data_clean <- distinct(data_clean, title, .keep_all = TRUE)
# Drop every row that still contains at least one missing value.
data_clean <- na.omit(data_clean)
# Switch to a base data.frame so that row names can be assigned below.
data_clean <- as.data.frame(data_clean)
# Use the (now unique, non-missing) title as row name, then drop the column.
row.names(data_clean) <- data_clean$title
data_clean <- data_clean %>% select(-title)
data_clean$type <- as.factor(data_clean$type)
data_clean$production_countries <- as.factor(data_clean$production_countries)
# Analyse univariée
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Bar chart of the number of titles per genre.
# - One fill colour per bar, interpolated from red to steelblue. The palette
#   size follows the number of genres present in the data instead of a fixed
#   18, which only works while exactly 18 bars are drawn.
# - `linewidth` replaces `size` for the bar outline (`size` is deprecated for
#   lines since ggplot2 3.4.0).
# - The former scale_fill_manual() call was removed: no fill aesthetic is
#   mapped, so it had no effect.
p <- ggplot(data = data_clean, aes(x = genres)) +
  geom_bar(
    stat = "count",
    position = position_dodge(),
    fill = colorRampPalette(c("red", "steelblue"))(
      length(unique(data_clean$genres))
    ),
    linewidth = 0.5,
    colour = "black"
  ) +
  ggtitle("Nombre des film / shows par categorie ") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## i Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Convert the genre bar chart to an interactive plotly widget and display it.
ggplotly(p = p)
=> Les deux catégories dominantes sont « drama » et « comedy ».
# Bar chart of the number of titles per content type.
# The text layer uses a one-row-per-type table so that each bar gets a single
# label; with the full data set the same label was drawn once per row.
type_labels <- data.frame(type = unique(data_clean$type))
type_labels$label <- ifelse(type_labels$type == "SHOW", "Show", "MOVIE")
dep.plot <- ggplot(data_clean, aes(type)) +
  geom_bar(
    stat = "count",
    position = position_dodge(),
    fill = c("red", "steelblue"),
    linewidth = 0.5, # `size` is deprecated for line widths (ggplot2 >= 3.4.0)
    colour = "black"
  ) +
  ggtitle("Distibution des individus par type ") +
  geom_text(
    data = type_labels,
    aes(label = label, y = 1000),
    vjust = -1.5
  )
ggplotly(dep.plot)
=> Le dataset contient 3299 films et 2042 émissions.
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# NOTE(review): `c` holds an empty highchart and is not used again in the
# visible part of the script; it also shares its name with base::c().
c <- highchart()
# Overlay the kernel density estimates of the IMDb and TMDB scores as two
# area series on a single chart, then display it.
hec <- hc_add_series(
  hchart(density(data_clean$imdb_score), type = "area", name = "IMDB score"),
  density(data_clean$tmdb_score),
  type = "area",
  color = "#B71C1C",
  name = "TMDB score"
)
hec
# Column positions are derived from the column types instead of being
# hard-coded, so they stay correct if columns are added, dropped or reordered
# upstream. For the current table this yields the same positions as before:
# numeric = 2, 3, 6, 7, 8, 9 and categorical = 1, 4, 5.
is_numeric_col <- vapply(data_clean, is.numeric, logical(1))
numeric_indices <- unname(which(is_numeric_col))
categorical_indices <- unname(which(!is_numeric_col))
# Pairwise correlation matrix of the numeric variables; drop = FALSE keeps a
# data frame even if only one numeric column is present.
data_corr <- cor(
  data_clean[, numeric_indices, drop = FALSE],
  use = "complete.obs"
)
# Heat map of the correlation matrix.
# NOTE(review): the hcaes() mapping refers to colnames(data_clean) (all
# columns) although the matrix only covers the numeric ones; it looks like
# hchart() ignores it for matrix input -- confirm and remove if so.
hchart(data_corr, type = "heatmap", hcaes(x = colnames(data_clean), y = colnames(data_clean), value = data_corr))
#Analyse en composantes principales
library(FactoMineR)
# Principal component analysis with unit-scaled variables; the categorical
# columns are passed through `quali.sup`.
res.pca <- PCA(
  X = data_clean,
  scale.unit = TRUE,
  quali.sup = categorical_indices
)
#Etude de l’inertie
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Kaiser criterion: flag the components whose eigenvalue exceeds 1. Only the
# eigenvalue column is tested; the two variance-percentage columns are not
# eigenvalues, so comparing them with 1 carries no meaning.
res.pca$eig[, "eigenvalue"] > 1
## comp 1 comp 2 comp 3 comp 4 comp 5 comp 6
## TRUE TRUE TRUE FALSE FALSE FALSE
# Scree plot of the explained variance per dimension, with the value printed
# on each bar. The argument is spelled `addlabels` (all lower case); the
# former `addLabels` did not match it, so no labels were requested.
fviz_screeplot(res.pca, addlabels = TRUE)
=> D’après la méthode de Kaiser, on retient les dimensions dont la valeur propre est supérieure à 1 ; dans notre cas, on retient donc la première, la deuxième et la troisième dimension.
=> Selon la méthode du coude, on se limite à la dimension où la chute est la plus conséquente et visible ; or, dans notre cas, la variation entre les dimensions n’est pas brusque/importante.
#analyse des variables
library(factoextra)
# Draw the variable map (arrows + repelled labels) three times, colouring the
# variables by "cos2", "contrib" and "coord" in turn. print() is required
# because the plots are created inside a function call.
invisible(lapply(
  c("cos2", "contrib", "coord"),
  function(colour_by) {
    print(
      fviz_pca_var(
        res.pca,
        geom = c("arrow", "text"),
        repel = TRUE,
        col.var = colour_by
      )
    )
  }
))
Dim.1 : Cette dimension est principalement caractérisée par les variables “release_year” (année de sortie), “runtime” (durée), “imdb_votes” (votes IMDb), “tmdb_popularity” (popularité TMDB) et “tmdb_score” (score TMDB). Elle semble capturer des aspects liés à la popularité, à la durée et à la réception des films. Des valeurs plus élevées sur cette dimension indiquent des films plus récents, plus longs, avec un plus grand nombre de votes sur IMDb, une plus grande popularité sur TMDB et des scores plus élevés sur TMDB. Cette dimension pourrait être associée au succès global ou à la reconnaissance des films.
Dim.2 : La deuxième dimension est principalement influencée par la variable “imdb_score” (score IMDb). Elle représente la qualité ou la note des films selon IMDb. Des valeurs plus élevées sur cette dimension indiquent des films avec des scores IMDb plus élevés, ce qui correspond à de meilleures évaluations globales. Cette dimension se concentre spécifiquement sur la qualité perçue des films selon les notes IMDb.
#Analyse des individus
# Individual maps (points + repelled labels), coloured in turn by "cos2",
# "contrib" and "coord".
fviz_pca_ind(
  res.pca,
  geom = c("point", "text"),
  repel = TRUE,
  col.ind = "cos2"
)
## Warning: ggrepel: 5332 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
fviz_pca_ind(
  res.pca,
  geom = c("point", "text"),
  repel = TRUE,
  col.ind = "contrib"
)
## Warning: ggrepel: 5332 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
fviz_pca_ind(
  res.pca,
  geom = c("point", "text"),
  repel = TRUE,
  col.ind = "coord"
)
## Warning: ggrepel: 5332 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Individual map coloured by content type, with one ellipse per type.
# The argument is spelled `ellipse.type`; the former `ellips.type` did not
# match it, so the requested "confidence" ellipses were not applied.
fviz_pca_ind(
  res.pca,
  geom.ind = "point",
  col.ind = data_clean$type,
  addEllipses = TRUE,
  ellipse.type = "confidence",
  repel = TRUE,
  legend.title = "Group",
  title = "PCA - Individuals"
)
# Agglomerative hierarchical clustering
# Centre and scale the numeric variables.
scaled_data <- scale(data_clean[, numeric_indices], center = TRUE)
# Euclidean distances between at most the first 500 rows; seq_len(min(...))
# avoids an out-of-bounds subscript when fewer than 500 rows are available.
d1 <- dist(
  scaled_data[seq_len(min(500, nrow(scaled_data))), , drop = FALSE],
  method = "euclidean"
)
# NOTE(review): ?hclust documents that "ward.D" only implements Ward's
# criterion when given squared distances, while "ward.D2" does so on plain
# Euclidean distances -- confirm that "ward.D" is the intended method here.
h1 <- hclust(d1, method = "ward.D")
h1$method
## [1] "ward.D"
# Print the `order` component of the clustering result.
h1[["order"]]
## [1] 114 115 126 134 164 176 108 255 409 410 422 423 133 431 124 268 199 214
## [19] 191 206 244 89 91 188 192 497 447 238 461 121 334 196 211 162 171 112
## [37] 285 104 113 308 316 130 310 127 287 111 100 118 54 120 240 298 101 123
## [55] 82 236 208 325 215 348 216 197 272 227 291 292 223 242 311 249 251 253
## [73] 354 286 446 213 438 444 248 335 204 237 222 269 315 256 374 274 275 440
## [91] 495 469 402 496 303 190 221 455 293 267 281 284 386 404 353 376 318 427
## [109] 294 419 347 435 383 301 416 475 448 345 361 319 464 273 398 385 442 468
## [127] 470 349 418 463 352 414 467 332 392 363 425 493 283 390 143 342 366 389
## [145] 279 346 471 387 405 327 441 321 326 451 456 276 343 296 381 369 331 450
## [163] 443 483 397 485 250 429 388 362 413 306 486 488 399 411 230 403 373 382
## [181] 433 434 394 395 408 428 351 426 270 462 329 340 417 359 458 320 309 484
## [199] 368 421 384 436 437 407 491 489 460 492 314 452 478 481 333 271 482 358
## [217] 459 219 379 98 145 302 312 307 377 280 393 234 337 290 254 266 232 278
## [235] 220 257 233 245 262 225 228 263 96 239 243 246 224 235 341 264 259 317
## [253] 241 300 305 177 151 372 375 295 378 153 138 148 163 313 499 324 339 453
## [271] 449 371 465 479 328 415 380 391 457 288 364 260 323 146 258 396 477 304
## [289] 356 412 473 498 277 472 365 344 289 297 350 355 282 370 330 406 476 487
## [307] 140 159 430 474 432 454 338 420 445 480 494 322 400 401 466 336 357 424
## [325] 439 367 360 490 86 202 189 203 180 195 7 17 12 9 27 26 19 22
## [343] 24 31 23 11 16 25 14 10 30 15 34 21 32 53 29 81 28 33
## [361] 18 56 60 45 20 79 94 109 128 95 129 58 47 51 46 61 110 37
## [379] 42 167 65 80 41 49 75 77 63 13 57 172 179 160 174 102 117 116
## [397] 149 107 122 105 97 135 103 106 229 85 90 99 119 43 44 71 73 68
## [415] 170 62 141 161 132 136 137 152 158 125 142 173 166 70 147 157 169 139
## [433] 155 165 131 178 52 76 59 78 72 69 74 55 66 50 48 64 67 144
## [451] 150 154 168 156 175 84 185 187 500 181 182 87 200 205 265 226 299 92
## [469] 93 207 252 209 217 261 218 247 193 212 231 194 210 184 183 186 198 35
## [487] 88 83 201 3 6 39 38 40 2 4 5 8 1 36
# Print the two-column `merge` matrix of the clustering result.
h1[["merge"]]
## [,1] [,2]
## [1,] -309 -484
## [2,] -282 -370
## [3,] -409 -410
## [4,] -340 -417
## [5,] -250 -429
## [6,] -280 -393
## [7,] -267 -281
## [8,] -314 -452
## [9,] -362 -413
## [10,] -406 -476
## [11,] -288 -364
## [12,] -480 -494
## [13,] -443 -483
## [14,] -283 -390
## [15,] -336 -357
## [16,] -251 -253
## [17,] -412 -473
## [18,] -351 -426
## [19,] -408 -428
## [20,] -142 -173
## [21,] -418 -463
## [22,] -196 -211
## [23,] -284 -386
## [24,] -256 -374
## [25,] -486 -488
## [26,] -162 -171
## [27,] -320 1
## [28,] -148 -163
## [29,] -355 2
## [30,] -273 -398
## [31,] -270 -462
## [32,] -394 -395
## [33,] -436 -437
## [34,] 7 23
## [35,] -407 -491
## [36,] -37 -42
## [37,] -460 -492
## [38,] -306 25
## [39,] -399 -411
## [40,] -304 -356
## [41,] -59 -78
## [42,] -114 -115
## [43,] -391 -457
## [44,] -127 -287
## [45,] -363 -425
## [46,] -345 -361
## [47,] -493 14
## [48,] -295 -378
## [49,] -353 -376
## [50,] -248 -335
## [51,] -70 -147
## [52,] -65 -80
## [53,] -307 -377
## [54,] -232 -278
## [55,] -279 -346
## [56,] -385 -442
## [57,] -277 -472
## [58,] -228 -263
## [59,] -289 -297
## [60,] -274 -275
## [61,] -243 -246
## [62,] -126 -134
## [63,] -397 -485
## [64,] -319 -464
## [65,] -204 -237
## [66,] -466 15
## [67,] -152 -158
## [68,] -422 -423
## [69,] -478 -481
## [70,] -332 -392
## [71,] -99 -119
## [72,] -388 9
## [73,] -339 -453
## [74,] -125 20
## [75,] -368 -421
## [76,] -404 49
## [77,] -318 -427
## [78,] -190 -221
## [79,] -167 52
## [80,] -230 -403
## [81,] -359 -458
## [82,] -48 -64
## [83,] -371 -465
## [84,] -100 -118
## [85,] -199 -214
## [86,] -498 57
## [87,] -373 -382
## [88,] -94 -109
## [89,] 18 31
## [90,] -16 -25
## [91,] -467 70
## [92,] -434 32
## [93,] -13 -57
## [94,] -360 -490
## [95,] -238 -461
## [96,] -155 -165
## [97,] -215 -348
## [98,] -104 -113
## [99,] -116 -149
## [100,] -331 -450
## [101,] -56 -60
## [102,] -157 -169
## [103,] -260 -323
## [104,] -254 -266
## [105,] -164 -176
## [106,] -430 -474
## [107,] -138 28
## [108,] -124 -268
## [109,] -102 -117
## [110,] 24 60
## [111,] -338 -420
## [112,] -310 44
## [113,] -235 -341
## [114,] -349 21
## [115,] -344 59
## [116,] -107 -122
## [117,] -321 -326
## [118,] -396 -477
## [119,] -71 -73
## [120,] -156 -175
## [121,] -151 -372
## [122,] -29 -81
## [123,] -213 -438
## [124,] -448 46
## [125,] -291 -292
## [126,] -76 41
## [127,] 13 63
## [128,] -75 -77
## [129,] -41 -49
## [130,] -106 -229
## [131,] -300 -305
## [132,] -424 -439
## [133,] -20 -79
## [134,] -324 73
## [135,] -489 37
## [136,] -259 -317
## [137,] -95 -129
## [138,] -352 -414
## [139,] 27 75
## [140,] -46 -61
## [141,] -293 34
## [142,] -153 107
## [143,] 38 39
## [144,] -38 -40
## [145,] 45 47
## [146,] -172 -179
## [147,] 5 72
## [148,] -234 -337
## [149,] -327 -441
## [150,] -367 94
## [151,] -225 58
## [152,] -400 -401
## [153,] -222 -269
## [154,] -389 55
## [155,] -455 141
## [156,] -301 -416
## [157,] 62 105
## [158,] -206 -244
## [159,] -432 -454
## [160,] -10 -30
## [161,] -451 -456
## [162,] -264 136
## [163,] -132 -136
## [164,] -220 -257
## [165,] -15 -34
## [166,] -55 -66
## [167,] -224 113
## [168,] -242 -311
## [169,] -112 -285
## [170,] -325 97
## [171,] -308 -316
## [172,] 17 86
## [173,] -447 95
## [174,] -286 -446
## [175,] 19 89
## [176,] -499 134
## [177,] -207 -252
## [178,] -47 -51
## [179,] -144 -150
## [180,] -405 149
## [181,] -131 -178
## [182,] -329 4
## [183,] -137 67
## [184,] -177 121
## [185,] 3 68
## [186,] -296 -381
## [187,] -245 -262
## [188,] -330 10
## [189,] -271 -482
## [190,] -445 12
## [191,] -96 -239
## [192,] -444 50
## [193,] -365 115
## [194,] 99 116
## [195,] -9 -27
## [196,] -111 84
## [197,] 132 150
## [198,] -89 -91
## [199,] -240 -298
## [200,] -97 -135
## [201,] -43 -44
## [202,] -52 126
## [203,] -121 -334
## [204,] -249 16
## [205,] -101 -123
## [206,] -92 -93
## [207,] -369 100
## [208,] 8 69
## [209,] -166 51
## [210,] -50 82
## [211,] -205 -265
## [212,] 33 35
## [213,] -328 -415
## [214,] -294 -419
## [215,] 11 103
## [216,] -440 -495
## [217,] -18 101
## [218,] -475 124
## [219,] -186 -198
## [220,] -133 -431
## [221,] -103 130
## [222,] -168 120
## [223,] 30 56
## [224,] 76 77
## [225,] -189 -203
## [226,] -303 78
## [227,] 114 138
## [228,] -354 174
## [229,] 163 183
## [230,] -128 137
## [231,] 117 161
## [232,] 48 142
## [233,] -11 90
## [234,] -227 125
## [235,] -53 122
## [236,] -191 158
## [237,] -209 -217
## [238,] 65 153
## [239,] 106 159
## [240,] -497 173
## [241,] -194 -210
## [242,] -347 -435
## [243,] -350 29
## [244,] -226 -299
## [245,] -223 168
## [246,] 61 167
## [247,] -143 -342
## [248,] -82 -236
## [249,] -110 36
## [250,] -315 110
## [251,] 54 164
## [252,] -197 -272
## [253,] -258 118
## [254,] 109 194
## [255,] 42 157
## [256,] -313 176
## [257,] -146 253
## [258,] -62 -141
## [259,] -218 -247
## [260,] 98 171
## [261,] -68 -170
## [262,] 91 145
## [263,] 80 87
## [264,] -290 104
## [265,] 22 26
## [266,] -69 -74
## [267,] -140 -159
## [268,] -83 -201
## [269,] 40 172
## [270,] -154 222
## [271,] -449 83
## [272,] 169 260
## [273,] -21 -32
## [274,] -384 212
## [275,] -85 -90
## [276,] 64 223
## [277,] -276 -343
## [278,] -2 -4
## [279,] -468 -470
## [280,] -402 -496
## [281,] -233 187
## [282,] -105 200
## [283,] -433 92
## [284,] -63 93
## [285,] -241 131
## [286,] -54 -120
## [287,] -479 213
## [288,] -161 229
## [289,] -139 96
## [290,] -322 152
## [291,] 127 147
## [292,] 88 230
## [293,] -216 252
## [294,] 162 285
## [295,] -160 -174
## [296,] -6 -39
## [297,] -183 219
## [298,] -193 -212
## [299,] -387 180
## [300,] 6 148
## [301,] -28 -33
## [302,] -12 195
## [303,] -380 43
## [304,] -208 170
## [305,] -19 -22
## [306,] -383 156
## [307,] 111 190
## [308,] -188 -192
## [309,] -108 -255
## [310,] -72 266
## [311,] 151 191
## [312,] 81 139
## [313,] -366 154
## [314,] 140 249
## [315,] 175 182
## [316,] -130 112
## [317,] 85 236
## [318,] -302 -312
## [319,] 79 129
## [320,] 143 263
## [321,] -14 160
## [322,] 135 208
## [323,] 204 228
## [324,] 177 237
## [325,] 66 197
## [326,] 234 245
## [327,] 242 306
## [328,] 247 313
## [329,] 239 307
## [330,] 186 207
## [331,] 238 250
## [332,] 179 270
## [333,] 102 289
## [334,] -26 305
## [335,] 221 275
## [336,] -184 297
## [337,] -7 -17
## [338,] 272 316
## [339,] 185 220
## [340,] -5 -8
## [341,] -375 232
## [342,] -261 259
## [343,] -219 -379
## [344,] 181 202
## [345,] 203 265
## [346,] 166 210
## [347,] -45 133
## [348,] 211 244
## [349,] 193 243
## [350,] 146 295
## [351,] 144 278
## [352,] -487 267
## [353,] -88 268
## [354,] 123 192
## [355,] -84 -185
## [356,] 165 273
## [357,] 205 248
## [358,] -358 -459
## [359,] 108 317
## [360,] -471 299
## [361,] 256 271
## [362,] 235 301
## [363,] 155 224
## [364,] -58 178
## [365,] 71 201
## [366,] -98 -145
## [367,] 254 282
## [368,] 218 276
## [369,] 274 322
## [370,] 251 281
## [371,] 53 300
## [372,] 217 347
## [373,] 209 333
## [374,] -469 280
## [375,] -24 -31
## [376,] 258 288
## [377,] 196 286
## [378,] 287 303
## [379,] 198 308
## [380,] 206 324
## [381,] 314 319
## [382,] 227 262
## [383,] 128 284
## [384,] 309 339
## [385,] -231 241
## [386,] 290 325
## [387,] 296 351
## [388,] -1 -36
## [389,] 293 326
## [390,] 215 257
## [391,] 264 370
## [392,] 240 345
## [393,] 321 356
## [394,] 226 363
## [395,] -202 225
## [396,] 184 341
## [397,] 246 294
## [398,] 277 330
## [399,] 188 352
## [400,] 214 327
## [401,] -3 387
## [402,] 359 379
## [403,] 189 358
## [404,] 74 373
## [405,] 199 357
## [406,] 348 380
## [407,] 340 388
## [408,] 310 346
## [409,] 328 360
## [410,] 233 393
## [411,] 279 382
## [412,] 343 366
## [413,] 304 389
## [414,] 269 349
## [415,] 318 371
## [416,] 312 369
## [417,] 291 320
## [418,] 334 375
## [419,] -23 410
## [420,] 298 385
## [421,] 119 261
## [422,] 342 420
## [423,] 361 378
## [424,] 231 398
## [425,] 344 408
## [426,] 362 372
## [427,] 292 364
## [428,] 255 384
## [429,] 311 397
## [430,] 323 354
## [431,] -87 -200
## [432,] -35 353
## [433,] 216 374
## [434,] 315 416
## [435,] 338 377
## [436,] 283 434
## [437,] -187 -500
## [438,] 302 418
## [439,] 390 414
## [440,] 329 386
## [441,] -333 403
## [442,] -86 395
## [443,] 412 415
## [444,] -67 332
## [445,] 392 435
## [446,] 350 367
## [447,] 381 383
## [448,] 335 365
## [449,] 355 437
## [450,] 368 411
## [451,] 405 413
## [452,] 417 436
## [453,] 331 433
## [454,] 391 429
## [455,] 376 404
## [456,] 337 438
## [457,] 399 440
## [458,] 419 426
## [459,] 406 422
## [460,] 430 453
## [461,] 446 448
## [462,] 409 424
## [463,] 396 423
## [464,] 336 432
## [465,] 421 455
## [466,] -181 -182
## [467,] 439 457
## [468,] 401 407
## [469,] 451 460
## [470,] 425 444
## [471,] 394 400
## [472,] -180 -195
## [473,] 450 462
## [474,] 402 445
## [475,] 442 472
## [476,] 447 461
## [477,] 449 466
## [478,] 428 474
## [479,] 431 459
## [480,] 441 443
## [481,] 427 476
## [482,] 456 458
## [483,] 469 471
## [484,] 465 470
## [485,] 463 467
## [486,] 478 483
## [487,] 481 484
## [488,] 464 468
## [489,] 454 485
## [490,] 479 488
## [491,] 480 489
## [492,] 452 491
## [493,] 482 487
## [494,] 473 492
## [495,] 477 490
## [496,] 493 495
## [497,] 486 494
## [498,] 475 496
## [499,] 497 498
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.15.2
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
# Convert the hclust result to a dendrogram object and print its one-line
# description.
hcd <- as.dendrogram(h1)
print(hcd)
## 'dendrogram' with 2 branches and 500 members total, at height 221.3302
# Plot the tree returned by raise.dendrogram() called with 100.
hcd %>%
  raise.dendrogram(100) %>%
  plot(main = "Raised tree")
library(clValid)
## Loading required package: cluster
# NOTE(review): this sets a global option named `clValid.maxitems`; clValid()
# also has a `maxitems` argument -- confirm that the option is actually read.
options(clValid.maxitems = 500)
# Same rows as used for the clustering tree: at most the first 500, without
# failing when fewer rows are available.
subset_data <- scaled_data[
  seq_len(min(500, nrow(scaled_data))), ,
  drop = FALSE
]
# Internal validation (connectivity, Dunn, silhouette) of hierarchical
# clustering for 2 to 6 clusters. The argument is spelled `clMethods`; the
# former `clmethods` did not match it and only worked because "hierarchical"
# is also the default.
# NOTE(review): no linkage `method` is passed here, whereas the tree `h1` is
# built with "ward.D" -- confirm both use the linkage you intend to compare.
intern <- clValid(
  subset_data,
  2:6,
  clMethods = "hierarchical",
  validation = "internal"
)
summary(intern)
##
## Clustering Methods:
## hierarchical
##
## Cluster sizes:
## 2 3 4 5 6
##
## Validation Measures:
## 2 3 4 5 6
##
## hierarchical Connectivity 4.2143 10.1143 12.7726 14.9810 18.2238
## Dunn 0.3661 0.2511 0.2511 0.2511 0.2511
## Silhouette 0.8236 0.7123 0.6794 0.6417 0.6393
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 4.2143 hierarchical 2
## Dunn 0.3661 hierarchical 2
## Silhouette 0.8236 hierarchical 2
=> Suite aux tests de validation, on peut retenir un nombre optimal de clusters égal à 2.
##Ajout des classes au jeux de données et visualisation des cluster sur les plans de l’ACP
# Cut the tree `h1` into two groups and print the group label of every
# clustered row (rows are named by title).
classes <- cutree(tree = h1, k = 2)
print(classes)
## Rocky
## 1
## Grease
## 1
## The Sting
## 1
## Rocky II
## 1
## Monty Python and the Holy Grail
## 1
## Animal House
## 1
## Monty Python's Flying Circus
## 1
## Life of Brian
## 1
## White Christmas
## 1
## Heroes
## 1
## Play Misty for Me
## 1
## Cairo Station
## 1
## Richard Pryor: Live in Concert
## 1
## Bandie
## 1
## Prince
## 1
## FTA
## 1
## Monty Python's Fliegender Zirkus
## 1
## Hitler: A Career
## 1
## Amrapali
## 1
## Alibaba Aur 40 Chor
## 1
## Manoranjan
## 1
## Professor
## 1
## Khoon Khoon
## 1
## Saladin the Victorious
## 1
## The Other Side of the Mountain
## 1
## Dark Waters
## 1
## The Blazing Sun
## 1
## The Return of the Prodigal Son
## 1
## Alexandria… Why?
## 1
## Salaakhen
## 1
## The Land
## 1
## Elaan
## 1
## Whispers
## 1
## Lal Patthar
## 1
## Seinfeld
## 1
## Top Gun
## 1
## Road House
## 1
## Rocky IV
## 1
## Labyrinth
## 1
## Rocky III
## 1
## Fletch
## 1
## The 'Burbs
## 1
## Rocky V
## 1
## Twins
## 1
## Out of Africa
## 1
## Parenthood
## 1
## Thomas & Friends
## 1
## Legal Eagles
## 1
## She's Gotta Have It
## 1
## Strange Voices
## 1
## Fireman Sam
## 1
## In Defense of a Married Man
## 1
## The Four Seasons
## 1
## Parrot Sketch Not Included: Twenty Years of Monty Python
## 2
## The Little Wars
## 1
## Mobile Suit Gundam III: Encounters in Space
## 1
## Monty Python Live at the Hollywood Bowl
## 1
## Danger Mouse
## 1
## Unspeakable Acts
## 1
## Mobile Suit Gundam II: Soldiers of Sorrow
## 1
## Mobile Suit Gundam: Char's Counterattack
## 1
## Agneepath
## 1
## Waiting for the Hearse
## 1
## The George McKenna Story
## 1
## The Ryan White Story
## 1
## Disco Dancer
## 1
## Dostana
## 1
## Mujrim
## 1
## Sohni Mahiwal
## 1
## Dil
## 1
## Pyar Ke Do Pal
## 1
## Jaal
## 1
## Duniya
## 1
## Aakhri Adaalat
## 1
## Quiet Victory: The Charlie Wedemeyer Story
## 1
## Alexandria, Again and Forever
## 1
## Adam: His Song Continues
## 1
## A Stoning in Fulham County
## 1
## Ek Jaan Hain Hum
## 1
## Too Young The Hero
## 1
## An Egyptian Story
## 1
## Survivor
## 2
## One Piece
## 1
## Pokémon
## 1
## Sleepless in Seattle
## 1
## Forrest Gump
## 1
## Reservoir Dogs
## 1
## Gilmore Girls
## 1
## Cowboy Bebop
## 2
## Galaxy Quest
## 1
## Neon Genesis Evangelion
## 2
## Jerry Maguire
## 1
## Notting Hill
## 1
## The Real World
## 1
## Power Rangers
## 1
## Big Daddy
## 2
## The Quick and the Dead
## 1
## Big Brother
## 2
## I Know What You Did Last Summer
## 1
## The Magic School Bus
## 2
## Inuyasha
## 2
## Kicking and Screaming
## 1
## Seven Years in Tibet
## 1
## Yu-Gi-Oh!
## 2
## Stepmom
## 1
## The Mask of Zorro
## 1
## Life
## 1
## Neon Genesis Evangelion: The End of Evangelion
## 2
## Adventures of Sonic the Hedgehog
## 1
## My Girl
## 1
## Kenan & Kel
## 2
## Girlfriends
## 2
## H
## 2
## Jerry Seinfeld: I'm Telling You for the Last Time
## 2
## The Last Days
## 2
## Croupier
## 1
## Heavy
## 1
## All That
## 2
## The Nutty Professor
## 1
## Heartbreak High
## 2
## The Wiggles
## 2
## Edge of Seventeen
## 1
## Cardcaptor Sakura
## 2
## Okupas
## 2
## Sankofa
## 1
## A Chinese Odyssey Part One: Pandora's Box
## 2
## The Parkers
## 2
## Moesha
## 1
## Sister, Sister
## 1
## Before the Flying Circus
## 2
## Maya Memsaab
## 1
## Dil Se..
## 1
## Herod's Law
## 2
## A Chinese Odyssey Part Two: Cinderella
## 2
## Blue Streak
## 1
## Kabhi Haan Kabhi Naa
## 1
## Bombay
## 1
## Phir Bhi Dil Hai Hindustani
## 2
## Chaahat
## 1
## Nutty Professor II: The Klumps
## 2
## Hum Aapke Hain Koun..!
## 1
## Wild Tango
## 1
## Kuch Kuch Hota Hai
## 2
## Yaar Gaddar
## 1
## Barney & Friends
## 2
## The Other
## 2
## Chamatkar
## 1
## Mann
## 2
## Chronicle of a Disappearance
## 1
## Oh Darling! Yeh Hai India!
## 1
## Duplicate
## 2
## Muthu
## 1
## Hum Saath Saath Hain
## 2
## Ram Jaane
## 1
## Anjaam
## 1
## English Babu Desi Mem
## 1
## Destiny
## 1
## Avvai Shanmugi
## 1
## Hello Brother
## 2
## Sinbad: Afros and Bellbottoms
## 1
## Damini
## 1
## Monty Python: Live at Aspen
## 2
## Jeans
## 2
## West Beirut
## 2
## Gumrah
## 1
## Shikari
## 1
## Out of Life
## 1
## Yodha
## 1
## Minsara Kanavu
## 1
## Dushmani
## 1
## Children of Shatila
## 2
## Sinbad: Nothin' but the Funk
## 1
## The Emigrant
## 1
## A Triumph of the Heart: The Ricky Bell Story
## 1
## Aashik Aawara
## 1
## The Trial of Adolf Eichmann
## 2
## Qila
## 2
## Nightmare in Columbia County
## 1
## Sinbad: Son of a Preacher Man
## 1
## Breaking Bad
## 1
## The Walking Dead
## 1
## Grey's Anatomy
## 1
## Arrested Development
## 1
## Community
## 1
## NCIS
## 1
## Avatar: The Last Airbender
## 1
## Supernatural
## 1
## Top Gear
## 2
## The Lord of the Rings: The Fellowship of the Ring
## 1
## The Great British Baking Show
## 2
## Friday Night Lights
## 2
## The IT Crowd
## 2
## Minority Report
## 1
## Scott Pilgrim vs. the World
## 1
## The Dark Knight
## 1
## The Amazing Race
## 2
## Heartland
## 2
## DEATH NOTE
## 1
## Monster
## 2
## The Hangover
## 1
## Naruto
## 1
## The Lord of the Rings: The Two Towers
## 1
## The Lord of the Rings: The Return of the King
## 1
## Mononoke
## 2
## Brokeback Mountain
## 1
## Chappelle's Show
## 2
## The Mist
## 1
## Ancient Aliens
## 2
## Road to Perdition
## 1
## Easy A
## 1
## The Mole
## 2
## The Pursuit of Happyness
## 1
## American Pickers
## 2
## Code Geass: Lelouch of the Rebellion
## 2
## Pawn Stars
## 2
## My Little Pony: Friendship Is Magic
## 2
## Closer
## 1
## The Aviator
## 1
## The Hills
## 2
## Burlesque
## 2
## Borgen
## 2
## Transformers: Prime
## 2
## iCarly
## 2
## Zathura: A Space Adventure
## 2
## The Longest Yard
## 2
## Ip Man
## 1
## Ned's Declassified School Survival Guide
## 2
## Secret Window
## 2
## A Knight's Tale
## 1
## Underworld
## 2
## Transformers: Revenge of the Fallen
## 1
## Julie & Julia
## 2
## Tears of the Sun
## 2
## Laguna Beach
## 2
## Along Came Polly
## 2
## The Fairly OddParents
## 2
## Midnight Diner
## 2
## The Staircase
## 2
## Resident Evil
## 2
## Hidden Passion
## 2
## G.I. Joe: The Rise of Cobra
## 2
## Victorious
## 2
## Flushed Away
## 2
## Trailer Park Boys
## 2
## Seabiscuit
## 2
## Monster House
## 2
## King Kong
## 1
## Total Drama Island
## 2
## Sonic X
## 2
## The Order of Myths
## 2
## Winx Club
## 2
## 21
## 1
## Zoey 101
## 2
## Spanglish
## 2
## Spirit: Stallion of the Cimarron
## 2
## kimi ni todoke -From Me to You-
## 2
## Leap Year
## 2
## Enough
## 2
## Gamer
## 2
## The Pink Panther
## 2
## 3 Idiots
## 1
## Gridiron Gang
## 2
## Resident Evil: Apocalypse
## 2
## Resident Evil: Afterlife
## 2
## Seven Pounds
## 1
## The Legend of Zorro
## 2
## Autumn's Concerto
## 2
## Still Game
## 2
## Black Butler
## 2
## Nora's Will
## 2
## An Elf's Story
## 2
## Boys Over Flowers
## 2
## Don
## 2
## Toradora!
## 2
## Ouran High School Host Club
## 2
## Rang De Basanti
## 2
## Daddy's Little Girls
## 2
## The Other Boleyn Girl
## 2
## Swades
## 2
## Bo on the Go!
## 2
## Trailer Park Boys: Say Goodnight to the Bad Guys
## 2
## New in Town
## 2
## Dev.D
## 2
## Mike Birbiglia: What I Should Have Said Was Nothing
## 2
## Code Lyoko
## 2
## The Game
## 2
## One on One
## 2
## Daddy Day Care
## 2
## You Will Meet a Tall Dark Stranger
## 2
## Eat Pray Love
## 2
## Nuevo Rico Nuevo Pobre
## 2
## H2O: Just Add Water
## 2
## Einsatzgruppen: The Nazi Death Squads
## 2
## The Fierce Wife
## 2
## Asoka
## 2
## A Wednesday!
## 2
## Chloe
## 2
## The Cartel
## 2
## Hachi: A Dog's Tale
## 1
## Monsters vs Aliens
## 2
## The Legend of Bruce Lee
## 2
## Basketball Wives
## 2
## Louis C.K.: Hilarious
## 2
## The Boy Who Cried Werewolf
## 2
## The Taking of Pelham 1 2 3
## 2
## Initial D
## 2
## The Garfield Show
## 2
## Kevin James: Sweat the Small Stuff
## 2
## Duck Season
## 2
## Half & Half
## 2
## Big Time Rush
## 2
## Pocoyo
## 2
## Love Aaj Kal
## 2
## Accident
## 2
## Black & White
## 2
## El Escamoso
## 2
## I Now Pronounce You Chuck & Larry
## 2
## Jim Gaffigan: Beyond the Pale
## 2
## Phir Hera Pheri
## 2
## Trailer Park Boys: The Movie
## 2
## The Legend of Bhagat Singh
## 2
## Anchor Baby
## 2
## RV
## 2
## Billu
## 2
## Masha and the Bear
## 2
## Madness in the Desert
## 2
## Guru
## 2
## Golmaal Returns
## 2
## One Piece: Strong World
## 2
## The Figurine: Araromire
## 2
## Jab We Met
## 2
## Jaane Tu... Ya Jaane Na
## 2
## Banyuki
## 2
## Kath & Kim
## 2
## Den-noh Coil
## 2
## Palermo Hollywood
## 2
## Johnny Test
## 2
## Falafel
## 2
## Ajab Prem Ki Ghazab Kahani
## 2
## Cairo 6,7,8
## 2
## Open Season
## 2
## Kabhi Khushi Kabhie Gham
## 2
## Like Stars on Earth
## 2
## Astro Boy
## 2
## Fashion
## 2
## Kal Ho Naa Ho
## 2
## Ben & Holly's Little Kingdom
## 2
## Octonauts
## 2
## Manorama Six Feet Under
## 2
## Open Season 2
## 2
## Trailer Park Boys: Countdown to Liquor Day
## 2
## Ishqiya
## 2
## Jim Gaffigan: King Baby
## 2
## Dinosaur King
## 2
## The Pink Panther 2
## 2
## Once a Gangster
## 2
## Fida
## 2
## Ijé: The Journey
## 2
## Comedian
## 2
## My Führer
## 2
## Raajneeti
## 2
## Salt of This Sea
## 2
## Wake Up Sid
## 2
## National Security
## 2
## Made of Honor
## 2
## Lagaan: Once Upon a Time in India
## 2
## Friends with Money
## 2
## Da Kath & Kim Code
## 2
## G.O.R.A.
## 2
## The Stepfather
## 2
## Paheli
## 2
## One 2 Ka 4
## 2
## Naruto the Movie: Legend of the Stone of Gelel
## 2
## Wakfu
## 2
## Kabhi Alvida Naa Kehna
## 2
## Kevin Hart: I'm a Grown Little Man
## 2
## Tayo the Little Bus
## 2
## Chalte Chalte
## 2
## Daddy Day Camp
## 2
## I Can Do Bad All By Myself
## 2
## Ip Man 2
## 2
## Naruto the Movie: Guardians of the Crescent Moon Kingdom
## 2
## Pororo the Little Penguin
## 2
## The Legend of the Nahuala
## 2
## Om Shanti Om
## 2
## Iris
## 2
## Jodhaa Akbar
## 2
## Beast Stalker
## 2
## Dil Chahta Hai
## 2
## Rock On!!
## 2
## I Hate Luv Storys
## 2
## Luck by Chance
## 2
## Chhota Bheem
## 2
## Naruto Shippuden the Movie: Bonds
## 2
## Naruto Shippuden the Movie: The Will of Fire
## 2
## Barbershop 2: Back in Business
## 2
## Legend of the Fist: The Return of Chen Zhen
## 2
## Main Hoon Na
## 2
## Bon Cop Bad Cop
## 2
## Look for a Star
## 2
## A Romantic Comedy
## 2
## Vientos de agua
## 2
## Naruto the Movie: Ninja Clash in the Land of Snow
## 2
## Fated to Love You
## 2
## Lakshya
## 2
## Kaal
## 2
## This Is the Life
## 2
## Sir! No Sir!
## 2
## Inuyasha the Movie: Affections Touching Across Time
## 2
## Inuyasha the Movie 4: Fire on the Mystic Island
## 2
## Arahan
## 2
## Waist Deep
## 2
## Connected
## 2
## Karthik Calling Karthik
## 2
## Dhan Dhana Dhan Goal
## 2
## The Prince Who Turns into a Frog
## 2
## Peepli Live
## 2
## Life in a Metro
## 2
## Monty Python's Personal Best
## 2
## The Kite
## 2
## Rabun
## 2
## Inuyasha the Movie 3: Swords of an Honorable Ruler
## 2
## Inuyasha the Movie 2: The Castle Beyond the Looking Glass
## 2
## Twins Mission
## 2
## Kaminey
## 2
## Love in a Puff
## 2
## Jeff Dunham: Arguing with Myself
## 2
## Jeff Dunham's Very Special Christmas Special
## 2
## Under the Bombs
## 2
## Pomegranates and Myrrh
## 2
## Tito
## 2
## A Love Story
## 2
## Naruto Shippuden the Movie
## 2
## Naruto Shippuden the Movie: The Lost Tower
## 2
## Zig and Sharko
## 2
## One Piece: The Desert Princess and the Pirates: Adventure in Alabasta
## 2
## One Piece: Episode of Chopper Plus: Bloom in the Winter, Miracle Cherry Blossom
## 2
## The Mafia Dolls
## 2
## The Magic Roundabout
## 2
## Kung Fu Panda Awesome Secrets
## 2
## Sivaji: The Boss
## 2
## Welcome
## 2
## The Unjust
## 2
## Oscar's Oasis 2
## 2
## Eternal Summer
## 2
## DreamWorks Shrek's Swamp Stories
## 2
## Vizontele
## 2
## Chup Chup Ke
## 2
## Awara Paagal Deewana
## 2
## Udaan
## 2
## Kannathil Muthamittal
## 2
## Africa United
## 2
## Delhi-6
## 2
## Honeymoon Travels Pvt. Ltd.
## 2
## Monty Python: Almost the Truth (The Lawyer's Cut)
## 2
## Anbe Sivam
## 2
## We Are Family
## 2
## Divine Intervention
## 2
## The Hospital
## 2
## Chhota Bheem Aur Krishna
## 2
## The Trailer Park Boys Xmas Special
## 2
## Mukhsin
## 2
## Taxi No. 9 2 11
## 2
## Yuva
## 2
## Ishq Vishk
## 2
## Booha
## 2
## Do Dooni Chaar
## 2
## Anukokunda Oka Roju
## 2
## George Lopez: Why You Crying?
## 2
## Talentime
## 2
## A Lion in the House
## 2
## The Hostage
## 2
## Taimour & Shafi'aa
## 2
## Ezra
## 2
## Parugu
## 2
## Katkout
## 2
## Soldier In the Camp
## 2
## The Ghost
## 2
## Kuselan
## 2
## Zozo
## 2
## A Natural Born Fool
## 2
## The Ultimatum
## 2
## My Amnesia Girl
## 2
## Private Alexandria
## 2
## Transit Prisoner
## 2
## Italia's War
## 2
## Te quiero
## 2
## The Great Fava Beans of China
## 2
## Soul boy
## 2
## About Love and Passion
## 2
## A Very Special Love
## 2
## Frank & Cindy
## 2
## The Island
## 2
## Bosta
## 2
## Pink Zone
## 2
## Chhota Bheem & Krishna: Pataliputra- City of the Dead
## 2
## Frontiers of Dreams and Fears
## 2
## Encrypted Letter
## 2
## Pravarakyudu
## 2
## Shameless
## 1
# Open the cluster assignments in the data viewer only in interactive sessions:
# View() needs a GUI and may fail in batch or knitted runs.
if (interactive()) {
  View(classes)
}
# Count how many observations fall into each cluster.
table(classes)
## classes
## 1 2
## 172 328
# Bind the hierarchical-clustering (CAH) labels to the first 500 scaled rows.
# The factor column is named at creation so no hard-coded position is needed.
d.class <- cbind.data.frame(
  scaled_data[1:500, ],
  classes.cah = as.factor(classes)
)
# The data viewer needs a GUI, so only open it in interactive sessions.
if (interactive()) {
  View(d.class)
}
# Describe the clusters, locating the label column by name rather than by a
# fixed index.
catdes(d.class, num.var = which(colnames(d.class) == "classes.cah"))
##
## Link between the cluster variable and the quantitative variables
## ================================================================
## Eta2 P-value
## release_year 0.47038726 9.584151e-71
## imdb_votes 0.08513007 2.868563e-11
## runtime 0.04478746 1.801170e-06
## tmdb_popularity 0.03252361 4.996272e-05
##
## Description of each cluster by quantitative variables
## =====================================================
## $`1`
## v.test Mean in category Overall mean sd in category
## imdb_votes 6.517661 1.6409586 0.5849771 4.223044
## runtime 4.727467 0.8592783 0.4954530 1.128785
## tmdb_popularity 4.028559 0.4949743 0.1413928 2.305350
## release_year -15.320680 -3.9625761 -2.4827403 1.765473
## Overall sd p.value
## imdb_votes 2.620850 7.141223e-11
## runtime 1.244922 2.273384e-06
## tmdb_popularity 1.419768 5.611989e-05
## release_year 1.562475 5.563218e-53
##
## $`2`
## v.test Mean in category Overall mean sd in category
## release_year 15.320680 -1.70672893 -2.4827403 0.5800896
## tmdb_popularity -4.028559 -0.04402192 0.1413928 0.4311546
## runtime -4.727467 0.30466657 0.4954530 1.2603880
## imdb_votes -6.517661 0.03123074 0.5849771 0.4768679
## Overall sd p.value
## release_year 1.562475 5.563218e-53
## tmdb_popularity 1.419768 5.611989e-05
## runtime 1.244922 2.273384e-06
## imdb_votes 2.620850 7.141223e-11
library(factoextra)
# Plot the hierarchical-clustering partition of the first 500 scaled rows.
fviz_cluster(object = list(data = scaled_data[1:500, ], cluster = classes))
# Apply k-means with 2 centers. The starting centers are drawn at random, so
# the seed is fixed to make the partition (and its cluster numbering)
# reproducible from one run to the next.
set.seed(123)
# NOTE(review): the result is stored under the name `kmeans`, the same name as
# the function; the name is kept so existing references keep working.
kmeans <- stats::kmeans(scaled_data[1:500, ], centers = 2, iter.max = 1000)
# Add the k-means cluster labels to the data set.
d.class$classes.kmeans <- as.factor(kmeans$cluster)
# Describe the k-means clusters, locating the label column by name rather than
# by a fixed index.
res.cat1 <- catdes(
  d.class,
  num.var = which(colnames(d.class) == "classes.kmeans")
)
res.cat1
##
## Link between the cluster variable and the categorical variables (chi-square test)
## =================================================================================
## p.value df
## classes.cah 8.233625e-05 1
##
## Description of each cluster by the categories
## =============================================
## $`1`
## Cla/Mod Mod/Cla Global p.value v.test
## classes.cah=2 100.00000 66.66667 65.6 0.0001758575 3.751389
## classes.cah=1 95.34884 33.33333 34.4 0.0001758575 -3.751389
##
## $`2`
## Cla/Mod Mod/Cla Global p.value v.test
## classes.cah=1 4.651163 100 34.4 0.0001758575 3.751389
## classes.cah=2 0.000000 0 65.6 0.0001758575 -3.751389
##
##
## Link between the cluster variable and the quantitative variables
## ================================================================
## Eta2 P-value
## imdb_votes 0.76056470 1.071825e-156
## tmdb_popularity 0.07727822 2.522807e-10
## imdb_score 0.04930540 5.290466e-07
## tmdb_score 0.03417148 3.195862e-05
##
## Description of each cluster by quantitative variables
## =====================================================
## $`1`
## v.test Mean in category Overall mean sd in category
## tmdb_score -4.129354 0.08111544 0.1032642 0.9306130
## imdb_score -4.960181 0.27116263 0.2985350 0.9491248
## tmdb_popularity -6.209817 0.09106493 0.1413928 1.2798909
## imdb_votes -19.481319 0.29352137 0.5849771 1.0910835
## Overall sd p.value
## tmdb_score 0.9396260 3.637833e-05
## imdb_score 0.9667259 7.042758e-07
## tmdb_popularity 1.4197684 5.304620e-10
## imdb_votes 2.6208498 1.581527e-84
##
## $`2`
## v.test Mean in category Overall mean sd in category
## imdb_votes 19.481319 18.509507 0.5849771 5.4384636
## tmdb_popularity 6.209817 3.236556 0.1413928 3.9374680
## imdb_score 4.960181 1.981938 0.2985350 0.3584530
## tmdb_score 4.129354 1.465413 0.1032642 0.1842568
## Overall sd p.value
## imdb_votes 2.6208498 1.581527e-84
## tmdb_popularity 1.4197684 5.304620e-10
## imdb_score 0.9667259 7.042758e-07
## tmdb_score 0.9396260 3.637833e-05
# Check the choice of the number of clusters for hierarchical clustering (CAH)
# and k-means using internal validation measures.
library(clValid)
# NOTE(review): clValid() exposes a `maxitems` argument; confirm that this
# option is actually read by the package.
options(clValid.maxitems = 500)
subset_data <- scaled_data[1:500, ]
# Reuse the subset defined above instead of slicing scaled_data a second time.
intern1 <- clValid(
  subset_data,
  2:6,
  clMethods = c("hierarchical", "kmeans"),
  validation = "internal"
)
summary(intern1)
##
## Clustering Methods:
## hierarchical kmeans
##
## Cluster sizes:
## 2 3 4 5 6
##
## Validation Measures:
## 2 3 4 5 6
##
## hierarchical Connectivity 4.2143 10.1143 12.7726 14.9810 18.2238
## Dunn 0.3661 0.2511 0.2511 0.2511 0.2511
## Silhouette 0.8236 0.7123 0.6794 0.6417 0.6393
## kmeans Connectivity 8.7710 14.0627 41.9881 81.3266 84.5694
## Dunn 0.1825 0.1562 0.0647 0.0379 0.0379
## Silhouette 0.8067 0.6941 0.3607 0.3322 0.3289
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 4.2143 hierarchical 2
## Dunn 0.3661 hierarchical 2
## Silhouette 0.8236 hierarchical 2
# Supervised learning: random forest regression.
# Convert the scaled data to a data frame.
scaled_data <- as.data.frame(scaled_data)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Fix the seed so the random 70/30 partition is the same on every run.
set.seed(123)
# Row indices of the training sample (p = 0.7), drawn on imdb_score.
ind <- createDataPartition(
  scaled_data$imdb_score,
  times = 1,
  p = 0.7,
  list = FALSE
)
# `a` holds the training rows and `t` the remaining rows kept for testing.
a <- scaled_data[ind, ]
t <- scaled_data[-ind, ]
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Fitting a random forest involves random sampling, so the seed is fixed to
# make the fitted model and the reported error reproducible.
set.seed(123)
# Regress imdb_score on release year, vote count and runtime using the
# training set `a`.
model <- randomForest(
  imdb_score ~ release_year + imdb_votes + runtime,
  data = a
)
# Evaluate on the test set `t` with the mean squared error (MSE).
predictions <- predict(model, newdata = t)
MSE <- mean((predictions - t$imdb_score)^2)
MSE
## [1] 0.7680208
# Export the cleaned data set to a CSV file, writing row names as well.
utils::write.csv(x = data_clean, file = "final_data.csv", row.names = TRUE)